Hotel booking Dataset

1 load package

Code
import os
#os.system('pip install xgboost')
Code
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split

2 data

2.1 download data

https://www.kaggle.com/c/titanic/data

Code
import pandas as pd
url='https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv'
hotels=pd.read_csv(url)
Code
hotels.head()
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults ... deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 ... No Deposit NaN NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 ... No Deposit 304.0 NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 ... No Deposit 240.0 NaN 0 Transient 98.0 0 1 Check-Out 2015-07-03

5 rows × 32 columns

2.2 data EDA

Code
hotels.describe()
is_canceled lead_time arrival_date_year arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies is_repeated_guest previous_cancellations previous_bookings_not_canceled booking_changes agent company days_in_waiting_list adr required_car_parking_spaces total_of_special_requests
count 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119386.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 103050.000000 6797.000000 119390.000000 119390.000000 119390.000000 119390.000000
mean 0.370416 104.011416 2016.156554 27.165173 15.798241 0.927599 2.500302 1.856403 0.103890 0.007949 0.031912 0.087118 0.137097 0.221124 86.693382 189.266735 2.321149 101.831122 0.062518 0.571363
std 0.482918 106.863097 0.707476 13.605138 8.780829 0.998613 1.908286 0.579261 0.398561 0.097436 0.175767 0.844336 1.497437 0.652306 110.774548 131.655015 17.594721 50.535790 0.245291 0.792798
min 0.000000 0.000000 2015.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 6.000000 0.000000 -6.380000 0.000000 0.000000
25% 0.000000 18.000000 2016.000000 16.000000 8.000000 0.000000 1.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 9.000000 62.000000 0.000000 69.290000 0.000000 0.000000
50% 0.000000 69.000000 2016.000000 28.000000 16.000000 1.000000 2.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 14.000000 179.000000 0.000000 94.575000 0.000000 0.000000
75% 1.000000 160.000000 2017.000000 38.000000 23.000000 2.000000 3.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 229.000000 270.000000 0.000000 126.000000 0.000000 1.000000
max 1.000000 737.000000 2017.000000 53.000000 31.000000 19.000000 50.000000 55.000000 10.000000 10.000000 1.000000 26.000000 72.000000 21.000000 535.000000 543.000000 391.000000 5400.000000 8.000000 5.000000
Code
hotels.describe(include=[object])
hotel arrival_date_month meal country market_segment distribution_channel reserved_room_type assigned_room_type deposit_type customer_type reservation_status reservation_status_date
count 119390 119390 119390 118902 119390 119390 119390 119390 119390 119390 119390 119390
unique 2 12 5 177 8 5 10 12 3 4 3 926
top City Hotel August BB PRT Online TA TA/TO A A No Deposit Transient Check-Out 2015-10-21
freq 79330 13877 92310 48590 56477 97870 85994 74053 104641 89613 75166 1461

Missing Data

Code
hotels.isnull().sum()
hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company                           112593
days_in_waiting_list                   0
customer_type                          0
adr                                    0
required_car_parking_spaces            0
total_of_special_requests              0
reservation_status                     0
reservation_status_date                0
dtype: int64
Code
#import math
#hotels=hotels>> filter(math.isnan(_.children)==False)
from siuba.siu import call
from siuba import _, mutate, filter, group_by, summarize,show_query
from siuba import *

hotels >> group_by(_.children)  >> summarize(n = _.shape[0])
children n
0 0.0 110796
1 1.0 4861
2 2.0 3652
3 3.0 76
4 10.0 1
5 NaN 4
Code
#import math
hotels=hotels>>mutate(children=if_else(_.children > 0, True, False))

# Create a boolean mask and apply it
mask = pd.notna(hotels['children'])
hotels = hotels[mask]
Code
import sweetviz as sv
my_report = sv.analyze(hotels)
Code
my_report.show_notebook()

2.3 feature vs target

Code
my_report2 = sv.analyze(hotels,target_feat='children')
Code
my_report2.show_notebook()

2.4 compare train data and test data

Code
#compare = sv.compare(source=df_train, compare=df_test)
Code
#compare.show_notebook()

2.5 data dictionary

3 reference: